Loading packages

In [1]:
from genepy.utils import helper as h

# to comment in your case
from taigapy import TaigaClient
tc = TaigaClient()

from celligner import Celligner
import numpy as np
import pandas as pd
# to comment in your case
from depmapomics import tracker as track
#autoreload
%load_ext autoreload
%autoreload 2
#output
from bokeh.plotting import output_notebook
output_notebook()

from celligner.params import TISSUE_COLOR
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: 
Compilation is falling back to object mode WITH looplifting enabled because Function "l2_norm" failed type inference due to: No implementation of function Function(<function norm at 0x7f2e97d29a60>) found for signature:
 
 >>> norm(x=array(float32, 2d, A), axis=Literal[int](1))
 
There are 2 candidate implementations:
  - Of which 2 did not match due to:
  Overload in function 'norm_impl': File: numba/np/linalg.py: Line 2352.
    With argument(s): '(x=array(float32, 2d, A), axis=int64)':
   Rejected as the implementation raised a specific error:
     TypeError: norm_impl() got an unexpected keyword argument 'x'
  raised from /home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/typing/templates.py:722

During: resolving callee type: Function(<function norm at 0x7f2e97d29a60>)
During: typing of call at /home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py (16)


File "celligner/mnnpy/mnnpy/utils.py", line 16:
def l2_norm(in_matrix):
    return np.linalg.norm(x=in_matrix, axis=1)
    ^

  @jit(float32[:](float32[:, :]), nogil=True)
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:152: NumbaWarning: Function "l2_norm" was compiled in object mode without forceobj=True.

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:162: NumbaDeprecationWarning: 
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: Code running in object mode won't allow parallel execution despite nogil=True.
  @jit(float32[:](float32[:, :]), nogil=True)
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:29: NumbaPerformanceWarning: np.dot() is faster on contiguous arrays, called on (array(float32, 1d, A), array(float32, 1d, A))
  dist[i, j] = np.dot(m[i], n[j])
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:197: NumbaWarning: 
Compilation is falling back to object mode WITH looplifting enabled because Function "adjust_s_variance" failed type inference due to: NameError: name 'sq_dist_to_line' is not defined
  @jit(float32(float32[:, :], float32[:, :], float32[:], float32[:], float32), nogil=True)
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:197: NumbaWarning: 
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "adjust_s_variance" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "celligner/mnnpy/mnnpy/utils.py", line 205:
def adjust_s_variance(data1, data2, curcell, curvect, sigma):
    <source elided>
    totalprob2 = 0.
    for samecell in data2:
    ^

  @jit(float32(float32[:, :], float32[:, :], float32[:], float32[:], float32), nogil=True)
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:152: NumbaWarning: Function "adjust_s_variance" was compiled in object mode without forceobj=True, but has lifted loops.

File "celligner/mnnpy/mnnpy/utils.py", line 199:
def adjust_s_variance(data1, data2, curcell, curvect, sigma):
    distance1 = np.zeros((data1.shape[0], 2), dtype=np.float32)
    ^

  state.func_ir.loc))
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:162: NumbaDeprecationWarning: 
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "celligner/mnnpy/mnnpy/utils.py", line 199:
def adjust_s_variance(data1, data2, curcell, curvect, sigma):
    distance1 = np.zeros((data1.shape[0], 2), dtype=np.float32)
    ^

  state.func_ir.loc))
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:197: NumbaWarning: Code running in object mode won't allow parallel execution despite nogil=True.
  @jit(float32(float32[:, :], float32[:, :], float32[:], float32[:], float32), nogil=True)
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:236: NumbaPerformanceWarning: np.dot() is faster on contiguous arrays, called on (array(float32, 1d, C), array(float32, 1d, A))
  scale = np.dot(working, grad)
Loading BokehJS ...

Loading expression files

In [2]:
# Load the CCLE expression matrix (internal release) from Taiga.
# The latest public version can be found at https://depmap.org/portal/download/
# It can also be loaded like so:
#   pd.read_csv('gs://ccle_default_params/celligner_ex/CCLE_expression.csv.gz', index_col=0)
# The dataset version is pinned to the one Taiga resolved when this notebook
# was last run, so that re-running the notebook loads the same data.
CCLE_expression = tc.get(name='internal-21q3-fe4c',
                         version=16,
                         file='CCLE_expression_full')


# Load the TCGA expression matrix.
# TODO: document the source data and the script used to generate this dataset.
# It can also be loaded like so:
#   pd.read_csv('gs://ccle_default_params/celligner_ex/TCGA_expression.csv.gz', index_col=0)
TCGA_expression = tc.get(name='celligner-input-9827',
                         version=1,
                         file='tumor_expression')
No dataset version provided. Using version 16.
No dataset version provided. Using version 1.
In [3]:
# Subset gene names to Ensembl ids only, then keep the genes shared by the
# CCLE and TCGA matrices.
# WARNING: this cell reassigns CCLE_expression from itself, so running it a
# second time drops a further block of trailing columns. Reload the data first.

# Number of trailing CCLE columns discarded before matching genes.
# NOTE(review): presumably these are non-gene columns — confirm what they are.
N_TRAILING_COLUMNS_DROPPED = 92
CCLE_expression = CCLE_expression[CCLE_expression.columns[:-N_TRAILING_COLUMNS_DROPPED]]

# For names containing ' (', keep only the text between ' (' and the final
# character (presumably "SYMBOL (ENSEMBL_ID)" -> "ENSEMBL_ID"); others are kept as is.
CCLE_expression.columns = [
    name.split(' (')[1][:-1] if ' (' in name else name
    for name in CCLE_expression.columns
]

common = set(CCLE_expression.columns).intersection(
    set(TCGA_expression.columns))
# Iteration order of a set of strings changes between interpreter sessions
# (hash randomization), so sort to get the same column order on every run.
CCLE_expression = CCLE_expression[sorted(common)]
TCGA_expression = TCGA_expression[sorted(common)]

Managing annotations

In [5]:
# Load the cell line annotations.
# track.getTracker() uses pygsheets to load this sheet:
#   REFSHEET_URL=https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY
# Alternative:
#   Sheets.from_files(MY_ID, MYSTORAGE_ID).get(REFSHEET_URL).sheets[0].to_frame(index_col=0)
# You can also get it from:
#   pd.read_csv('gs://ccle_default_params/celligner_ex/CCLE_annotation.csv.gz', index_col=0)
CCLE_annotation = track.getTracker()


# Load the tumor annotations (generated manually).
# They can also be loaded from:
#   pd.read_csv('gs://ccle_default_params/celligner_ex/TCGA_annotation.csv.gz', index_col=0)
# The dataset version is pinned to the one Taiga resolved when this notebook
# was last run, so that re-running the notebook loads the same data.
TCGA_annotation = tc.get(name='celligner-input-9827',
                         version=1,
                         file='tumor_annotations')
No dataset version provided. Using version 1.
In [6]:
# Bring both annotation tables to the same layout: rows selected by the
# index of the matching expression matrix, with the columns
# tissue_type, disease_type and cell_type.

# Cell lines: one row per arxspan_id, indexed by it.
CCLE_annotation = (
    CCLE_annotation
    .drop_duplicates('arxspan_id')
    .set_index("arxspan_id")
    .loc[CCLE_expression.index, ["origin", 'subtype']]
    .rename(columns={"origin": "tissue_type", "subtype": 'disease_type'})
    .assign(cell_type="cancer cell line")
)

# Tumors: indexed by sampleID.
TCGA_annotation = (
    TCGA_annotation
    .set_index("sampleID")
    .loc[TCGA_expression.index, ["lineage", "subtype"]]
    .rename(columns={"lineage": "tissue_type", "subtype": 'disease_type'})
    .assign(cell_type="tumor sample")
)
In [7]:
# Some tissue names are not consistent between the two datasets: map them to
# a shared name in the tissue_type column, and label missing values "unknown".
# `np` is numpy, imported with the other packages in the first cell (without
# that import this cell raises a NameError on a fresh kernel).
# NOTE(review): 'thymus' is mapped to 'thyroid', which are different organs —
# confirm this mapping is intended.
rename = {np.nan: "unknown", "adrenal_cortex": "adrenal", "colorectal": "colon", 'thymus': 'thyroid'}
CCLE_annotation = CCLE_annotation.replace({"tissue_type": rename})
TCGA_annotation = TCGA_annotation.replace({"tissue_type": rename})

Fitting Celligner with the CCLE dataset

In [ ]:
# issues when rerunning celligner
In [8]:
# Create the aligner (constructed with make_plots=True) and fit it on the
# CCLE expression matrix together with its annotations.
# fit() is the last expression of the cell, so its return value is displayed.
my_alligner = Celligner(make_plots=True)
my_alligner.fit(CCLE_expression, CCLE_annotation)
fetching gene names from biomart cache
using only usefull genes
looking at 1411 samples.
found 29593 common genes
creating a fit dataset..
reducing dimensionality...
clustering...
WARNING: You’re trying to run this on 29593 dimensions of `.X`, if you really want this, set `use_rep='X'`.
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters
running differential expression on 37 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
done
Out[8]:
<celligner.Celligner at 0x7f2d8d7e39e8>
In [10]:
# Running with regular mnn: select the "mnn" method, then transform the TCGA
# expression matrix (with its annotations) using the fitted aligner.
# The result is bound to `_` so it is not displayed as the cell output.
my_alligner.method = "mnn"
_ = my_alligner.transform(TCGA_expression, TCGA_annotation)
looking at 12236 samples.
found 29593 common genes
creating a transform input..
reducing dimensionality...
clustering..
WARNING: You’re trying to run this on 70 dimensions of `.X`, if you really want this, set `use_rep='X'`.
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters..
running differential expression on 58 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
there is 0.398 overlap between the fit and transform dataset in their most variable genes
doing cPCA..
transform
regressing out the cPCA components..
doing the MNN analysis using scanPy MNN...
Performing cosine normalization...
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: 
Compilation is falling back to object mode WITH looplifting enabled because Function "l2_norm" failed type inference due to: No implementation of function Function(<function norm at 0x7ffb24068598>) found for signature:
 
 >>> norm(x=array(float32, 2d, A), axis=Literal[int](1))
 
There are 2 candidate implementations:
    - Of which 2 did not match due to:
    Overload in function 'norm_impl': File: numba/np/linalg.py: Line 2352.
      With argument(s): '(x=array(float32, 2d, A), axis=int64)':
     Rejected as the implementation raised a specific error:
       TypeError: norm_impl() got an unexpected keyword argument 'x'
  raised from /home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/typing/templates.py:722

During: resolving callee type: Function(<function norm at 0x7ffb24068598>)
During: typing of call at /home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py (16)


File "celligner/mnnpy/mnnpy/utils.py", line 16:
def l2_norm(in_matrix):
    return np.linalg.norm(x=in_matrix, axis=1)
    ^

  @jit(float32[:](float32[:, :]), nogil=True)
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:152: NumbaWarning: Function "l2_norm" was compiled in object mode without forceobj=True.

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:162: NumbaDeprecationWarning: 
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: Code running in object mode won't allow parallel execution despite nogil=True.
  @jit(float32[:](float32[:, :]), nogil=True)
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: 
Compilation is falling back to object mode WITH looplifting enabled because Function "l2_norm" failed type inference due to: No implementation of function Function(<function norm at 0x7ffb24068598>) found for signature:
 
 >>> norm(x=array(float32, 2d, A), axis=Literal[int](1))
 
There are 2 candidate implementations:
    - Of which 2 did not match due to:
    Overload in function 'norm_impl': File: numba/np/linalg.py: Line 2352.
      With argument(s): '(x=array(float32, 2d, A), axis=int64)':
     Rejected as the implementation raised a specific error:
       TypeError: norm_impl() got an unexpected keyword argument 'x'
  raised from /home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/typing/templates.py:722

During: resolving callee type: Function(<function norm at 0x7ffb24068598>)
During: typing of call at /home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py (16)


File "celligner/mnnpy/mnnpy/utils.py", line 16:
def l2_norm(in_matrix):
    return np.linalg.norm(x=in_matrix, axis=1)
    ^

  @jit(float32[:](float32[:, :]), nogil=True)
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:152: NumbaWarning: Function "l2_norm" was compiled in object mode without forceobj=True.

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:162: NumbaDeprecationWarning: 
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: Code running in object mode won't allow parallel execution despite nogil=True.
  @jit(float32[:](float32[:, :]), nogil=True)
Starting MNN correct iteration. Reference batch: 0
Step 1 of 1: processing batch 1
  Looking for MNNs...
found 10135 mnns..
  Computing correction vectors...
  Adjusting variance...
  Applying correction...
MNN correction complete. Gathering output...
done
reducing dimensionality...
> /home/jeremie/celligner/celligner/__init__.py(630)plot()
    629     import ipdb; ipdb.set_trace()
--> 630     if 'colors' not in plot_kwargs:
    631       if show_clusts:

ipdb> c
[1, 1, 1, 1, 1, 1, 1, 1, 1]
making plot...
In [29]:
# You can rerun it with different parameters.
# Here transform() is called without new data and with _rerun=False; the log
# below shows fewer steps than the first call.
# NOTE(review): the exact meaning of `_rerun` is not visible here — confirm.
# NOTE(review): method is set to "mnn" again, the same value as in the previous cell.
my_alligner.method = "mnn"
_ = my_alligner.transform(_rerun=False)
reducing dimensionality...
doing differential expression analysis on the clusters..
regressing out the cPCA components..
doing the MNN analysis using scanPy MNN...
Performing cosine normalization...
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: 
Compilation is falling back to object mode WITH looplifting enabled because Function "l2_norm" failed type inference due to: No implementation of function Function(<function norm at 0x7f48a80eaa60>) found for signature:
 
 >>> norm(x=array(float32, 2d, A), axis=Literal[int](1))
 
There are 2 candidate implementations:
    - Of which 2 did not match due to:
    Overload in function 'norm_impl': File: numba/np/linalg.py: Line 2352.
      With argument(s): '(x=array(float32, 2d, A), axis=int64)':
     Rejected as the implementation raised a specific error:
       TypeError: norm_impl() got an unexpected keyword argument 'x'
  raised from /home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/typing/templates.py:722

During: resolving callee type: Function(<function norm at 0x7f48a80eaa60>)
During: typing of call at /home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py (16)


File "celligner/mnnpy/mnnpy/utils.py", line 16:
def l2_norm(in_matrix):
    return np.linalg.norm(x=in_matrix, axis=1)
    ^

  @jit(float32[:](float32[:, :]), nogil=True)
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:152: NumbaWarning: Function "l2_norm" was compiled in object mode without forceobj=True.

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:162: NumbaDeprecationWarning: 
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: Code running in object mode won't allow parallel execution despite nogil=True.
  @jit(float32[:](float32[:, :]), nogil=True)
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: 
Compilation is falling back to object mode WITH looplifting enabled because Function "l2_norm" failed type inference due to: No implementation of function Function(<function norm at 0x7f48a80eaa60>) found for signature:
 
 >>> norm(x=array(float32, 2d, A), axis=Literal[int](1))
 
There are 2 candidate implementations:
    - Of which 2 did not match due to:
    Overload in function 'norm_impl': File: numba/np/linalg.py: Line 2352.
      With argument(s): '(x=array(float32, 2d, A), axis=int64)':
     Rejected as the implementation raised a specific error:
       TypeError: norm_impl() got an unexpected keyword argument 'x'
  raised from /home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/typing/templates.py:722

During: resolving callee type: Function(<function norm at 0x7f48a80eaa60>)
During: typing of call at /home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py (16)


File "celligner/mnnpy/mnnpy/utils.py", line 16:
def l2_norm(in_matrix):
    return np.linalg.norm(x=in_matrix, axis=1)
    ^

  @jit(float32[:](float32[:, :]), nogil=True)
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:152: NumbaWarning: Function "l2_norm" was compiled in object mode without forceobj=True.

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/object_mode_passes.py:162: NumbaDeprecationWarning: 
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "celligner/mnnpy/mnnpy/utils.py", line 15:
@jit(float32[:](float32[:, :]), nogil=True)
def l2_norm(in_matrix):
^

  state.func_ir.loc))
/home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py:14: NumbaWarning: Code running in object mode won't allow parallel execution despite nogil=True.
  @jit(float32[:](float32[:, :]), nogil=True)
Starting MNN correct iteration. Reference batch: 0
Step 1 of 1: processing batch 1
  Looking for MNNs...
found 10136 mnns..
  Computing correction vectors...
  Adjusting variance...
  Applying correction...
MNN correction complete. Gathering output...
done
reducing dimensionality...
making plot...
In [13]:
# Display the UMAP keyword arguments currently stored on the aligner.
my_alligner.umap_kwargs
Out[13]:
{'n_neighbors': 10, 'min_dist': 0.5, 'metric': 'euclidean', 'n_components': 2}

You can change the parameters and plot again (here, coloring by tissue type).

In [31]:
# Plot again, colored by tissue type with the TISSUE_COLOR table, passing
# custom UMAP settings.
my_alligner.plot(
    color_column="tissue_type",
    colortable=TISSUE_COLOR,
    umap_kwargs={'n_neighbors': 15, 'min_dist': 0.2, 'metric': 'cosine'},
)
reducing dimensionality...
making plot...
Out[31]:
Figure(
id = '6215', …)
In [28]:
# Plot with rerun=False: the log below shows only "making plot...", without
# the "reducing dimensionality..." step seen in the previous plot call.
my_alligner.plot(rerun=False)
making plot...
Out[28]:
Figure(
id = '4816', …)
In [11]:
# Save the aligner under ../temp/demo/ (the same path is passed to load()
# in the next section).
my_alligner.save('../temp/demo/')

Adding another dataset to Celligner

In [2]:
# You can load the saved model from GCP (by hand, or by installing gsutil).
# Make sure the target folder exists, then run:
# ! gsutil cp gs://ccle_default_params/cellinger_ex/model.pkl ../temp/demo/
# NOTE(review): the other bucket paths in this notebook use `celligner_ex`,
# this one uses `cellinger_ex` — verify the spelling.
# Then create a new aligner and load the saved state from ../temp/demo/.
my_alligner = Celligner()
my_alligner.load('../temp/demo/')
fetching gene names from biomart cache
using only usefull genes
In [ ]:
# Load the additional dataset to add to Celligner.
# TODO: tc.get() is called here without a dataset name or file, unlike the
# other tc.get(name=..., file=...) calls in this notebook; fill these in.
CCLF = tc.get()
In [ ]:
# If you want to align to both CCLE and TCGA, you can ask Celligner to treat the two
# (fit + _pre-transformed_ transform datasets) as a single fit dataset by calling:
# my_alligner.putAllToFit()

# You can add your dataset as a dataset to be aligned to, by putting it in fit:
# my_alligner.addToFit(yourdataset).transform()
# /!\ you need to already have a transform dataset (if you loaded the example model, this is TCGA)

# You can add your dataset as one to align, by putting it in transform:
# my_alligner.addToTransform(yourdataset)
# /!\ you need to already have a fit dataset (if you loaded the example model, this is CCLE)

# If your dataset is small, putting it in transform might not work well.
# If your dataset is small and similar enough, you can add the parameter dotransform=False
# (or dofit=False) so that it does not fully re-transform or refit but uses cached computation instead.

# NOTE(review): addToFit() is called without a dataset here, while the example
# above passes one (addToFit(yourdataset)) and CCLF is loaded in the previous
# cell but not used — confirm what should be passed.
my_alligner.addToFit().transform()
In [ ]:
# Plot the result, colored by the tissue_type annotation column.
my_alligner.plot(color_column="tissue_type")